Validation¶
Validation of algorithms and transformation results.
Chap 7. Validação
- Section 7.3 Resultados
- Section 7.3.2 Semantic Annotation
- Classificação de regras operativas, fatos, termos e nomes
- Section 7.3.3 nlp2sbvr
- Section 7.3.2 Semantic Annotation
Google colab¶
%load_ext autoreload
%autoreload 2
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
from google.colab import drive
drive.mount('/content/drive')
!rm -rf cfr2sbvr configuration checkpoint
!git clone https://github.com/asantos2000/master-degree-santos-anderson.git cfr2sbvr
%pip install -r cfr2sbvr/code/requirements.txt
!cp -r cfr2sbvr/code/src/configuration .
!cp -r cfr2sbvr/code/src/checkpoint .
!cp -r cfr2sbvr/code/config.colab.yaml config.yaml
DEFAULT_CONFIG_FILE="config.yaml"
else:
DEFAULT_CONFIG_FILE="../config.yaml"
Imports¶
# Standard library imports
import json
import os
import time
from datetime import datetime
from typing import List
# Third-party imports
import logging_setup.main as logging_setup
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mi
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import rules_taxonomy_provider.main as rules_taxonomy_provider
import scipy.stats as stats
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau, spearmanr, pearsonr, linregress
from openai import OpenAI
from pydantic import BaseModel, Field
# Local modules
import configuration.main as configuration
import checkpoint.main as checkpoint
from checkpoint.main import (
Document,
DocumentProcessor,
get_all_checkpoints,
restore_checkpoint,
save_checkpoint,
)
import llm_query.main as llm_query
from llm_query.main import query_instruct_llm
from rules_taxonomy_provider.main import RulesTemplateProvider
DEV_MODE = True
if DEV_MODE:
# Development mode
import importlib
importlib.reload(configuration)
importlib.reload(logging_setup)
importlib.reload(checkpoint)
importlib.reload(llm_query)
importlib.reload(rules_taxonomy_provider)
# Ensure plots are displayed inline if using a Jupyter notebook
%matplotlib inline
Settings¶
Configuration¶
# Load configuration
config = configuration.load_config(DEFAULT_CONFIG_FILE)
Logging¶
logger = logging_setup.setting_logging(config["DEFAULT_LOG_DIR"], config["LOG_LEVEL"])
2024-12-15 01:44:30 - INFO - Logging is set up with daily rotation.
Checkpoints¶
Restore the checkpoint¶
# Restore the checkpoint
# To run after extraction
last_checkpoint = configuration.get_last_filename(
config["DEFAULT_CHECKPOINT_DIR"], "documents", "json"
)
logger.info(f"{last_checkpoint=}")
config["DEFAULT_CHECKPOINT_FILE"] = last_checkpoint
manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])
2024-12-15 01:44:30 - INFO - last_checkpoint='../data/checkpoints/documents-2024-12-08-10.json' 2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json 2024-12-15 01:44:31 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-12-08-10.json.
General functions¶
# Summary statistics
def summary_statistics(df):
return df.describe()
# Token usage analysis
def token_usage_analysis(df):
plt.figure(figsize=(10, 6))
sns.histplot(df['total_tokens'], kde=True, bins=30)
plt.title('Distribution of Total Tokens')
plt.xlabel('Total Tokens')
plt.ylabel('Frequency')
plt.show()
plt.figure(figsize=(10, 6))
sns.boxplot(x='doc_type', y='total_tokens', data=df)
plt.title('Total Tokens by Document Type')
plt.xlabel('Document Type')
plt.ylabel('Total Tokens')
plt.xticks(rotation=45)
plt.show()
# Time efficiency analysis
def time_efficiency_analysis(df):
plt.figure(figsize=(10, 6))
sns.histplot(df['elapsed_time'], kde=True, bins=30)
plt.title('Distribution of Elapsed Time')
plt.xlabel('Elapsed Time (seconds)')
plt.ylabel('Frequency')
plt.show()
plt.figure(figsize=(10, 6))
sns.scatterplot(x='elapsed_time', y='tokens_per_second', data=df)
plt.title('Tokens per Second vs Elapsed Time')
plt.xlabel('Elapsed Time (seconds)')
plt.ylabel('Tokens per Second')
plt.show()
# Cost analysis
def cost_analysis(df):
df['cost'] = (df['total_tokens'] / 1_000_000) * df['price_per_million_tokens']
logger.info(f"Total cost: ${df['cost'].sum():.2f}")
plt.figure(figsize=(10, 6))
sns.histplot(df['cost'], kde=True, bins=30)
plt.title('Distribution of Execution Cost')
plt.xlabel('Cost ($)')
plt.ylabel('Frequency')
plt.show()
# Temporal trends analysis
def temporal_analysis(df):
df['created_date'] = df['created'].dt.date
daily_usage = df.groupby('created_date').size()
plt.figure(figsize=(10, 6))
daily_usage.plot()
plt.title('Daily Prompt Executions')
plt.xlabel('Date')
plt.ylabel('Number of Executions')
plt.show()
# Group performance comparison
def group_performance_analysis(df):
plt.figure(figsize=(10, 6))
sns.boxplot(x='model', y='elapsed_time', data=df)
plt.title('Elapsed Time by Model')
plt.xlabel('Model')
plt.ylabel('Elapsed Time (seconds)')
plt.xticks(rotation=45)
plt.show()
# Function to plot histogram for semscore and similarity_score side-by-side
def plot_histogram_side_by_side(df, title, xlabel, output_dir, filename):
plt.figure(figsize=(12, 6))
plt.hist(df["semscore"], bins=20, color="#D55E00", alpha=0.7, label="Semscore", linestyle="--", edgecolor="black")
plt.hist(df["similarity_score"], bins=20, color="#0072B2", alpha=0.7, label="Similarity Score", linestyle="-", edgecolor="black")
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.show()
plot_path = os.path.join(output_dir, filename)
plt.savefig(plot_path)
plt.close()
return plot_path
# Function to plot box plot for semscore and similarity_score side-by-side
def plot_boxplot_side_by_side(df, title, ylabel, output_dir, filename):
plt.figure(figsize=(8, 6))
boxplot = plt.boxplot(
[df["semscore"].dropna(), df["similarity_score"].dropna()],
labels=["Semscore", "Similarity Score"],
patch_artist=True,
boxprops=dict(color="black"),
medianprops=dict(color="black"),
capprops=dict(color="black"),
whiskerprops=dict(color="black"),
)
colors = ["#D55E00", "#0072B2"]
for patch, color in zip(boxplot['boxes'], colors):
patch.set_facecolor(color)
plt.title(title)
plt.ylabel(ylabel)
plt.grid(True)
plt.show()
plot_path = os.path.join(output_dir, filename)
plt.savefig(plot_path)
plt.close()
return plot_path
# Function to plot density plot for semscore and similarity_score side-by-side
def plot_density_side_by_side(df, title, xlabel, output_dir, filename):
plt.figure(figsize=(12, 6))
df["semscore"].plot(kind="kde", color="#D55E00", alpha=0.7, linestyle="--", label="Semscore")
df["similarity_score"].plot(kind="kde", color="#0072B2", alpha=0.7, linestyle="-", label="Similarity Score")
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel("Density")
plt.legend()
plt.grid(True)
plt.show()
plot_path = os.path.join(output_dir, filename)
plt.savefig(plot_path)
plt.close()
return plot_path
# Updated process_all_elements function
def process_all_elements_updated(element_data, output_dir):
os.makedirs(output_dir, exist_ok=True)
excel_file_path = os.path.join(output_dir, "combined_analysis_results.xlsx")
writer = pd.ExcelWriter(excel_file_path, engine="xlsxwriter")
workbook = writer.book
combined_df_list = []
image_files = []
for element_name, content in element_data.items():
df = pd.DataFrame(content)
numeric_cols = ["semscore", "similarity_score"]
df[numeric_cols] = df[numeric_cols].astype(float)
df["element_type"] = element_name
combined_df_list.append(df)
sheet_name = element_name[:31]
worksheet = workbook.add_worksheet(sheet_name)
writer.sheets[sheet_name] = worksheet
row = 0
# Histograms side-by-side
plot_filename = f"histogram_side_by_side_{element_name}.png"
plot_path = plot_histogram_side_by_side(
df,
f"Histograms of Semscore and Similarity Score - {element_name}",
"Scores",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
# Boxplots side-by-side
plot_filename = f"boxplot_side_by_side_{element_name}.png"
plot_path = plot_boxplot_side_by_side(
df,
f"Boxplots of Semscore and Similarity Score - {element_name}",
"Scores",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
try:
# Density plots side-by-side
plot_filename = f"density_side_by_side_{element_name}.png"
plot_path = plot_density_side_by_side(
df,
f"Density Plots of Semscore and Similarity Score - {element_name}",
"Scores",
output_dir,
plot_filename,
)
worksheet.insert_image(row, 0, plot_path)
row += 20
image_files.append(plot_path)
except Exception as e:
logger.error(f"Error plotting density side by side: {e}")
combined_df = pd.concat(combined_df_list, ignore_index=True)
# Combined Histograms side-by-side
plot_filename = "histogram_side_by_side_combined.png"
plot_path = plot_histogram_side_by_side(
combined_df,
"Combined Histograms of Semscore and Similarity Score",
"Scores",
output_dir,
plot_filename,
)
writer.sheets["Combined"] = workbook.add_worksheet("Combined")
writer.sheets["Combined"].insert_image(0, 0, plot_path)
image_files.append(plot_path)
# Combined Boxplots side-by-side
plot_filename = "boxplot_side_by_side_combined.png"
plot_path = plot_boxplot_side_by_side(
combined_df,
"Combined Boxplots of Semscore and Similarity Score",
"Scores",
output_dir,
plot_filename,
)
writer.sheets["Combined"].insert_image(25, 0, plot_path)
image_files.append(plot_path)
# Combined Density Plots side-by-side
plot_filename = "density_side_by_side_combined.png"
plot_path = plot_density_side_by_side(
combined_df,
"Combined Density Plots of Semscore and Similarity Score",
"Scores",
output_dir,
plot_filename,
)
writer.sheets["Combined"].insert_image(50, 0, plot_path)
image_files.append(plot_path)
writer.close()
for image_file in image_files:
if os.path.exists(image_file):
os.remove(image_file)
return combined_df
def remove_section_symbol(input_string: str) -> str:
"""
Removes the '§' symbol from the input string and trims whitespace.
Args:
input_string (str): The string from which to remove the '§' symbol.
Returns:
str: The cleaned string without the '§' symbol and leading/trailing whitespace.
Raises:
TypeError: If 'input_string' is not a string.
"""
if not isinstance(input_string, str):
raise TypeError("input_string must be a string")
return input_string.replace("§", "").strip()
def prompt_analysis(raw_data, output_dir):
# Create a DataFrame from the raw data
data = pd.DataFrame(
raw_data,
columns=["filename", "doc_type", "elapsed_time", "usage", "created", "model"],
)
# Transform 'created' to a human-readable datetime format
data["created"] = pd.to_datetime(data["created"], unit="s")
# Extract relevant information from the 'usage' dictionary
data["completion_tokens"] = data["usage"].apply(lambda x: x["completion_tokens"])
data["prompt_tokens"] = data["usage"].apply(lambda x: x["prompt_tokens"])
data["total_tokens"] = data["usage"].apply(lambda x: x["total_tokens"])
# Define a function to get reference model context length
def get_reference_model_context_length(model):
return reference_models.get(
model, 128_000
) # Default to 128,000 if model is unknown
# Define a function to get the price per million tokens
def get_price_per_million_tokens(model):
return price_per_million_tokens.get(
model, 2.50
) # Default to 2.50 if model is unknown
# Add context length and price per million tokens columns
data["reference_context_length"] = data["model"].apply(
get_reference_model_context_length
)
data["price_per_million_tokens"] = data["model"].apply(get_price_per_million_tokens)
# Overall Statistics
total_tokens = data["total_tokens"].sum()
num_samples = len(data)
average_elapsed_time = data["elapsed_time"].mean()
estimated_cost = (
data["total_tokens"] / 1_000_000 * data["price_per_million_tokens"]
).sum()
average_percentage_context_length = (
data["total_tokens"] / data["reference_context_length"]
).mean() * 100
min_created = data["created"].min().strftime("%Y-%m-%d %H:%M:%S")
max_created = data["created"].max().strftime("%Y-%m-%d %H:%M:%S")
# Add filename column to each statistic for origin tracking
filename = file_info["filename"]
# Data and time of the execution
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Create Overall Statistics DataFrame
overall_stats_df = pd.DataFrame(
[
{
"Total Tokens": total_tokens,
"Number of Samples": num_samples,
"Average Elapsed Time (s)": average_elapsed_time,
"Estimated Cost (USD)": estimated_cost,
"Average Percentage of Context Length (%)": average_percentage_context_length,
"Min Created Timestamp": min_created,
"Max Created Timestamp": max_created,
"origin": filename,
"run_at": now,
}
]
)
# Statistics by Sample Type (doc_type)
stats_by_doc_type = (
data.groupby("doc_type")
.agg(
total_tokens=("total_tokens", "sum"),
num_samples=("doc_type", "count"),
average_elapsed_time=("elapsed_time", "mean"),
average_tokens=("total_tokens", "mean"),
estimated_cost=(
"total_tokens",
lambda x: (x.sum() / 1_000_000)
* data.loc[x.index, "price_per_million_tokens"].mean(),
),
average_percentage_context_length=(
"total_tokens",
lambda x: (
x.mean() / data.loc[x.index, "reference_context_length"].mean()
)
* 100,
),
)
.reset_index()
)
stats_by_doc_type["filename"] = filename
stats_by_doc_type["run_at"] = now
# Statistics by Model
stats_by_model = (
data.groupby("model")
.agg(
total_tokens=("total_tokens", "sum"),
num_samples=("model", "count"),
average_elapsed_time=("elapsed_time", "mean"),
average_tokens=("total_tokens", "mean"),
average_percentage_context_length=(
"total_tokens",
lambda x: (x.mean() / get_reference_model_context_length(x.name)) * 100,
),
)
.reset_index()
)
stats_by_model["filename"] = filename
stats_by_model["run_at"] = now
# Add estimated cost and cost columns separately since they require different calculations
def calculate_group_cost(model):
price = get_price_per_million_tokens(model)
total_tokens = data[data["model"] == model]["total_tokens"].sum()
return (total_tokens / 1_000_000) * price
stats_by_model["estimated_cost"] = stats_by_model["model"].apply(
calculate_group_cost
)
stats_by_model["cost"] = stats_by_model["estimated_cost"]
# Calculate Tokens per Second
# Ensure there are no division by zero issues by filtering out zero elapsed times
data = data[data["elapsed_time"] > 0]
data["tokens_per_second"] = data["total_tokens"] / data["elapsed_time"]
# Write the statistics to an Excel file
file_name = os.path.join(output_dir, "prompt-analysis.xlsx")
with pd.ExcelWriter(file_name, engine="openpyxl") as writer:
# Replace the data on each sheet with the new data
overall_stats_df.to_excel(writer, sheet_name="Overall Statistics", index=False)
stats_by_doc_type.to_excel(
writer, sheet_name="Statistics by Sample Type", index=False
)
stats_by_model.to_excel(writer, sheet_name="Statistics by Model", index=False)
additional_stats_df = pd.DataFrame(
[
{
"Average Completion Tokens": data["completion_tokens"].mean(),
"Average Prompt Tokens": data["prompt_tokens"].mean(),
"Average Total Tokens per Sample": data["total_tokens"].mean(),
"Total Elapsed Time (s)": data["elapsed_time"].sum(),
"Average Tokens per Second": data["tokens_per_second"].mean(),
"origin": filename,
"run_at": now,
}
]
)
additional_stats_df.to_excel(
writer, sheet_name="Additional Statistics", index=False
)
data.to_excel(writer, sheet_name="Raw Data", index=False)
# Explanation Page
explanation_data = {
"Sheet Name": [
"Overall Statistics",
"Statistics by Sample Type",
"Statistics by Model",
"Additional Statistics",
"Raw Data",
],
"Description": [
"Summary statistics of the entire dataset, including total tokens, number of samples, average elapsed time, and estimated cost.",
"Statistics broken down by sample type (doc_type), including the total number of tokens and cost estimates for each type.",
"Statistics grouped by the model used, showing token utilization, cost, and elapsed time for each model.",
"Additional aggregated metrics such as average completion tokens, prompt tokens, total tokens per sample, and processing time.",
"The raw data used for generating all the statistics, including individual completions and their details.",
],
"Columns Explained": [
"Total Tokens: Total number of tokens processed. Number of Samples: Total number of samples. Average Elapsed Time (s): Average time taken for processing. Estimated Cost (USD): Estimated cost for token usage. Average Percentage of Context Length (%): Average percentage of used context length. Min and Max Created Timestamp: The time range of the data collected. Origin: Source filename.",
"doc_type: Type of document. total_tokens: Sum of tokens per document type. num_samples: Number of samples of this type. average_elapsed_time: Average time taken per document type. average_tokens: Average tokens per sample. estimated_cost: Estimated cost for tokens of this type. average_percentage_context_length: Average percentage of context length used. filename: Source filename.",
"model: Model name. total_tokens: Total number of tokens used by the model. num_samples: Number of samples processed by the model. average_elapsed_time: Average processing time for the model. average_tokens: Average number of tokens per sample. average_percentage_context_length: Average context length percentage used. filename: Source filename. estimated_cost/cost: Cost for the tokens used by the model.",
"Average Completion Tokens: Average number of completion tokens per sample. Average Prompt Tokens: Average number of prompt tokens per sample. Average Total Tokens per Sample: Average number of total tokens per sample. Total Elapsed Time (s): Total processing time for all samples. Average Tokens per Second: Average number of tokens processed per second. origin: Source filename.",
"filename: Source filename. doc_type: Type of document. elapsed_time: Time taken for each document. usage: Token usage details (completion and prompt). created: Timestamp of creation. model: Model used.",
],
}
explanation_df = pd.DataFrame(explanation_data)
explanation_df.to_excel(writer, sheet_name="Explanation", index=False)
# Display Overall Statistics
overall_stats_df_display = pd.DataFrame(
[
{
"Total Tokens": total_tokens,
"Number of Samples": num_samples,
"Average Elapsed Time (s)": average_elapsed_time,
"Estimated Cost (USD)": estimated_cost,
"Average Percentage of Context Length (%)": average_percentage_context_length,
"Min Created Timestamp": min_created,
"Max Created Timestamp": max_created,
"origin": filename,
"run_at": now,
}
]
)
print("\nOverall Statistics:")
print(overall_stats_df_display.to_string(index=False))
# Display Statistics by Sample Type
print("\nStatistics by Sample Type (doc_type):")
print(stats_by_doc_type.to_string(index=False))
# Display Statistics by Model
print("\nStatistics by Model:")
print(stats_by_model.to_string(index=False))
# Additional Statistics
additional_stats_df_display = pd.DataFrame(
[
{
"Average Completion Tokens": data["completion_tokens"].mean(),
"Average Prompt Tokens": data["prompt_tokens"].mean(),
"Average Total Tokens per Sample": data["total_tokens"].mean(),
"Total Elapsed Time (s)": data["elapsed_time"].sum(),
"Average Tokens per Second": data["tokens_per_second"].mean(),
"origin": filename,
"run_at": now,
}
]
)
print("\nAdditional Statistics:")
print(additional_stats_df_display.to_string(index=False))
return data
# Add similarity_classification based on similarity_score
def classify_similarity(score):
if score == 1.0:
return "identical"
elif score >= 0.9:
return "close-match"
else:
return "not-sure"
# Modify the highlight_similarity function to use three colors
def highlight_similarity(val):
if val == "identical":
color = "green"
elif val == "close-match":
color = "yellow"
else:
color = "red"
return f"background-color: {color}"
def create_df_elements_results(similarity_elements_results):
# Build the dataframe
df_results = pd.DataFrame(similarity_elements_results)
df_results["similarity_classification"] = df_results["similarity_score"].apply(
classify_similarity
)
df_results["classification_match"] = (
df_results["classification_pred"] == df_results["classification_true"]
)
df_results["classification_match_label"] = df_results["classification_match"].map(
{True: "match", False: "mismatch"}
)
df_results["source_match"] = df_results["source_pred"] == df_results["source_true"]
df_results["source_match_label"] = df_results["source_match"].map(
{True: "match", False: "mismatch"}
)
df_results["id_match"] = df_results["id_pred"] == df_results["id_true"]
df_results["id_match_label"] = df_results["id_match"].map(
{True: "match", False: "mismatch"}
)
return df_results
class JudgeStatement(BaseModel):
doc_id: str = Field(..., description="Document ID associated with the statement.")
statement_id: str = Field(
...,
description="A provided string that identifies the statement. e.g., '1', 'Person'.",
)
statement: str = Field(..., description="The statement to be transformed.")
sources: List[str] = Field(..., description="Sources of the statement.")
semscore: float = Field(..., description="just a copy from input semscore.")
similarity_score: float = Field(
...,
description="Similarity score between the original and transformed sentences.",
)
similarity_score_confidence: float = Field(
..., description="Confidence score for the similarity score."
)
transformation_accuracy: float = Field(
..., description="Accuracy score for the transformation."
)
grammar_syntax_accuracy: float = Field(
..., description="Accuracy score for the grammar and syntax."
)
findings: List[str] = Field(..., description="List of findings.")
class JudgeStatements(BaseModel):
JudgeStatements: List[JudgeStatement] = Field(
..., description="List of judge statements."
)
def get_prompts_for_judge(rules, data_dir):
rule_template_provider = RulesTemplateProvider(data_dir)
system_prompts = []
user_prompts = []
for rule in rules:
element_name = rule.get("element_name")
if element_name == ["Term", "Name"]:
statement_key = "definition"
statement_id_key = "signifier"
else:
statement_key = "statement"
statement_id_key = "statement_id"
user_prompt = get_user_prompt_judge_sentence_similarity(element_name, rule)
user_prompts.append(user_prompt)
rule_templates_subtemplates = rule_template_provider.get_rules_template(
rule["templates_ids"]
)
system_prompt = get_system_prompt_judge_sentence_similarity(
rule_templates_subtemplates
)
system_prompts.append(system_prompt)
logger.debug(system_prompt)
logger.debug(user_prompt)
logger.info(f"System prompts for {element_name}s: {len(system_prompts)}")
logger.info(f"User prompts for {element_name}s: {len(user_prompts)}")
return system_prompts, user_prompts, element_name
def evaluate_statement(element_name, user_prompts, system_prompts, manager):
# Initialize an empty list to accumulate all responses
all_responses = []
elapse_times = []
completions = []
# Loop through each pair of user and system prompts with a counter
for index, (user_prompt, system_prompt) in enumerate(
zip(user_prompts, system_prompts), start=1
):
logger.info(f"Processing evaluation prompt {index} for {element_name}.")
logger.debug(system_prompt)
logger.debug(user_prompt)
# Query the language model
response, completion, elapse_time = query_instruct_llm(
system_prompt=system_prompt,
user_prompt=user_prompt,
document_model=JudgeStatements,
llm_model=config["LLM"]["MODEL"],
temperature=config["LLM"]["TEMPERATURE"],
max_tokens=config["LLM"]["MAX_TOKENS"],
)
logger.debug(f"{response}")
# Accumulate the responses in the list
all_responses.extend(response.JudgeStatements)
elapse_times.append(elapse_time)
completions.append(completion.dict())
logger.info(f"Finished processing evaluation {index}.")
logger.info("Waiting 2s before processing the next prompt to avoid rate limits")
time.sleep(2)
# After the loop, create a single Document with all the accumulated responses
doc = Document(
id=f"validation_judge_{element_name.replace(' ', '_')}s",
type="llm_validation",
content=all_responses,
elapsed_times=elapse_times,
completions=completions,
)
manager.add_document(doc)
logger.info(f"{element_name}s: {len(all_responses)}")
return all_responses
def get_embedding(text, model="text-embedding-3-large"):
client = OpenAI()
text = text.replace("\n", " ")
return client.embeddings.create(input=[text], model=model).data[0].embedding
def cosine_similarity(embedding1, embedding2):
# Calcula a similaridade de cosseno entre dois embeddings
embedding1 = np.array(embedding1)
embedding2 = np.array(embedding2)
cos_sim = np.dot(embedding1, embedding2) / (
np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
)
return cos_sim
def compare_sentences(sentence1, sentence2):
# Obtem embeddings para as duas frases
embedding1 = get_embedding(sentence1)
embedding2 = get_embedding(sentence2)
# Calcula a similaridade de cosseno entre os embeddings
# similarity = cosine_similarity(embedding1, embedding2)
similarity = 1 - cosine(embedding1, embedding2)
return similarity
Datasets¶
From section 7.2.4 Datasets
The dataset of the previous algorithm was adjusted with the gold standard dataset. The goal is to reduce the accumulation of errors from one step to the next.
The data adjusted:
- § 275.0-2_P1, § 275.0-2_P2
- § 275.0-5_P1, § 275.0-5_P2
- § 275.0-7_P1, § 275.0-7_P2
True tables¶
There are no true tables to evaluate the transformation, the evaluation depends on the algorithms SEMSCORE and "LLM as a Judge".
Predicted values¶
Get predicted elements from all runs
elements = []
managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])
for manager, file_info in zip(managers, file_info_list):
# Process documents
processor = DocumentProcessor(manager, merge=True)
# Access processed data
elements.append(
{"pred_facts": processor.get_rules(),
"pred_terms": processor.get_facts(),
"pred_names": processor.get_terms(definition_filter="non_null"),
"pred_operative_rules": processor.get_names(definition_filter="non_null"),
"pred_file_info": file_info}
)
2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json 2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json 2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json 2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json 2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json 2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json 2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json 2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json 2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json 2024-12-15 01:44:31 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json
Set dataset to evaluation and check empty transformed elements
for element_item in elements:
for key in element_item.keys():
if key == "pred_file_info":
continue
empty_transformed_elements = [
item for item in element_item[key] if not item.get("transformed")
]
logger.info(
f'Empty transformed {element_item["pred_file_info"].get("filename")} {key}: {len(empty_transformed_elements)}/{len(element_item[key])}'
)
2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-1.json pred_facts: 0/6 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-1.json pred_terms: 0/16 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-1.json pred_names: 0/28 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-1.json pred_operative_rules: 0/5 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-10.json pred_facts: 0/6 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-10.json pred_terms: 0/16 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-10.json pred_names: 0/28 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-10.json pred_operative_rules: 0/5 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-2.json pred_facts: 0/6 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-2.json pred_terms: 0/16 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-2.json pred_names: 0/28 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-2.json pred_operative_rules: 0/5 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-3.json pred_facts: 0/6 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-3.json pred_terms: 0/16 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-3.json pred_names: 0/28 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-3.json pred_operative_rules: 0/5 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-4.json pred_facts: 0/6 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-4.json pred_terms: 0/16 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-4.json pred_names: 0/28 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-4.json pred_operative_rules: 0/5 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-5.json pred_facts: 0/6 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-5.json pred_terms: 0/16 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-5.json pred_names: 0/28 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-5.json pred_operative_rules: 0/5 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-6.json pred_facts: 0/6 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-6.json pred_terms: 0/16 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-6.json pred_names: 0/28 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-6.json pred_operative_rules: 0/5 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-7.json pred_facts: 0/6 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-7.json pred_terms: 0/16 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-7.json pred_names: 0/28 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-7.json pred_operative_rules: 0/5 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-8.json pred_facts: 0/6 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-8.json pred_terms: 0/16 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-8.json pred_names: 0/28 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-8.json pred_operative_rules: 0/5 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-9.json pred_facts: 0/6 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-9.json pred_terms: 0/16 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-9.json pred_names: 0/28 2024-12-15 01:44:32 - INFO - Empty transformed documents-2024-12-08-9.json pred_operative_rules: 0/5
Check missing values.
for element_item in elements:
for key in element_item.keys():
if key == "pred_file_info":
continue
element_df = pd.DataFrame(element_item[key])
# Check if there are any missing values
if element_df.isnull().any().any():
mi.matrix(element_df, figsize=(10, 5))
plt.title(f'Missing Values for {key} in {element_item["pred_file_info"].get("filename")}')
plt.show() # Ensure the plot displays
else:
logger.info(f'No missing values for {key} in {element_item["pred_file_info"].get("filename")}')
2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-1.json 2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-1.json 2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-1.json 2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-1.json 2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-10.json 2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-10.json 2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-10.json 2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-10.json 2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-2.json 2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-2.json 2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-2.json 2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-2.json 2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-3.json 2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-3.json 2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-3.json 2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-3.json 2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-4.json 2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-4.json 2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-4.json 2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-4.json 2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-5.json 2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-5.json 2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-5.json 2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-5.json 2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-6.json 2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-6.json 2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-6.json 2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-6.json 2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-7.json 2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-7.json 2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-7.json 2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-7.json 2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-8.json 2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-8.json 2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-8.json 2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-8.json 2024-12-15 01:44:32 - INFO - No missing values for pred_facts in documents-2024-12-08-9.json 2024-12-15 01:44:32 - INFO - No missing values for pred_terms in documents-2024-12-08-9.json 2024-12-15 01:44:32 - INFO - No missing values for pred_names in documents-2024-12-08-9.json 2024-12-15 01:44:32 - INFO - No missing values for pred_operative_rules in documents-2024-12-08-9.json
Algorithms¶
Validation of algorithm from section 6.2 Implementation of main components
Source for section 7.3 Results
nlp2sbvr¶
Elements measurements from chapter 7.2.3 Terms, names, facts, and operative rules
Measuring similarity with SEMSCORE¶
Evaluating SEMSCORE (AYNETDINOV;AKBIK, 2024) between the predicted and true statements for each element.
WARNING: Expensive operation!
If the data is available could skip processing evaluation. Operation is expensive, if just need to compile the evaluation, set SKIP to True.
SKIP = True
if not SKIP:
for element_item in elements:
for key in element_item.keys():
if key == "pred_file_info":
continue
for item in element_item[key]:
original_sentence = f'{item.get("statement_id")}: {item.get("statement", item.get("definition"))}'
transformed_sentence = item.get("transformed")
templates_ids = item.get("templates_ids")
element_name = item.get("element_name")
logger.info(f"{original_sentence=}")
logger.info(f"{transformed_sentence=}")
logger.info(f"{templates_ids=}")
logger.info(f"{element_name=}")
logger.info(f"{key=}")
logger.info(f'{element_item["pred_file_info"]=}')
# Remove keys if they exist
for key in [
"explanation",
"confidence",
"subtype_confidence",
"subtype_explanation",
]:
item.pop(key, None) # Using pop with None to avoid KeyError
logger.debug(f"{element_name=}")
# Calculate similarity score
similarity = compare_sentences(original_sentence, transformed_sentence)
# Assign the calculated score to 'semscore'
item["semscore"] = similarity
else:
logger.debug(
f"{item.get('element_name')} already has a semscore: {item['semscore']}"
)
Check if SEMSCORE was calculated.
for element_item in elements:
for key in element_item.keys():
if key == "pred_file_info":
continue
semscore_in_operative_rules = all(
"semscore" in item and item["semscore"] is not None
for item in element_item[key]
)
logger.info(
f'{key} to evaluate: {len(element_item[key])}, semscore was calculated: {semscore_in_operative_rules}'
)
2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_facts to evaluate: 6, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_terms to evaluate: 16, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_names to evaluate: 28, semscore was calculated: True 2024-12-15 01:44:32 - INFO - pred_operative_rules to evaluate: 5, semscore was calculated: True
Evaluation criterias (SHANKAR et al., 2024)¶
Based on the prompt, there are three inferred evaluation criteria that align with the approach proposed by EvalGen (SHANKAR et al., 2024):
Similarity Score
- Given the original_sentence and tranformed_sentence, how similar are they from 0 to 1? And how confident are you about your estimation from 0 to 1?
Transformation Accuracy
- From 0 to 1, how does the "transformed_sentence" reflect the original_sentence with the structure and phrasing provided by the template?
Grammar and Syntax Accuracy
- How is the transformed sentence grammatically correct and syntactically accurate from 0 to 1?
LLM-as-a-judge¶
References of the LLM-as-a-judge approach: (WEI; CHEN; LUO, 2024), (DONG; HU; COLLIER, 2024), (ZHENG et al., 2023)
Prompt engineering¶
System prompt
def get_system_prompt_judge_sentence_similarity(template):
return f"""
# Task
You're an expert in judging sentence similarity and transformation using a template.
These criteria should support the evaluation process by verifying classification accuracy, template application, and transformation fidelity.
Check the criteria and evaluate the output:
1. **Similarity Score**
- Given the statement or definition and tranformed sentence (transformed), how similar are they from 0 to 1? And how confident are you about your estimation from 0 to 1?
2. **Transformation Accuracy**
- From 0 to 1, how does the transformed sentence (transformed) reflect the original sentence (statement or definition) with the structure and phrasing provided by the template and subtemplates?
3. **Grammar and Syntax Accuracy**
- How is the transformed sentence (transformed) grammatically correct and syntactically accurate from 0 to 1?
# Output Format
Record your evaluation in JSON format as follows:
```json
{{
"doc_id": "<Document ID>",
"statement_id": "<Statement ID>",
"sources": ["<source>"],
"similarity_score": <Similarity score>,
"similarity_score_confidence": <Confidence score>,
"transformation_accuracy": <Transformation score>,
"grammar_syntax_accuracy": <Grammar score>,
"findings": ["<Things found during the evaluation and worth to be mentioned>",
"<other things to mention>"
],
"semscore": <original semscore>
}}
```
# Input example
{{
"doc_id": <Document ID>,
"statement_id": <Statement ID>,
"statement or definition": <original sentence>,
"sources": [<source>],
"terms": [
{{"term": <signifier>, "classification": <Proper or Common Noun>}},
...
],
"verb_symbols": <verbs or phrasal verbs>,
"element_name": <name of element: Name, Term, Fact, Fact Type, Operative Rule>,
"transformed": <transformed sentence>,
"type": <type of element: Definitional, Activity, Party, Data>,
"subtype": <subtype of element>,
"templates_ids": ["T8"],
"semscore": <semscore>
}}
# Templates and Subtemplates
{template}
"""
User prompt
def get_user_prompt_judge_sentence_similarity(element_name, rule):
return f"""
# rule data for an element: {element_name}
{json.dumps(rule, indent=2)}
"""
Measuring similarity with LLM Judge¶
Preparing system and user prompts for each element and call the judge.
if not SKIP:
for element_item in elements:
for key in element_item.keys():
if key == "pred_file_info":
continue
system_prompts, user_prompts, element_name = get_prompts_for_judge(
element_item[key], config["DEFAULT_DATA_DIR"]
)
logger.debug(f"{system_prompts=}")
logger.debug(f"{user_prompts=}")
responses = evaluate_statement(
element_name=element_name,
user_prompts=user_prompts,
system_prompts=system_prompts,
manager=manager,
)
# Persist the state to a file
filename=f'{config["DEFAULT_CHECKPOINT_DIR"]}/{element_item["pred_file_info"].get("filename")}'
logger.debug(f"Saving the state to a file for {filename}")
save_checkpoint(filename=filename, manager=manager)
Average similarity score per document 5s.
Elements evaluation¶
managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])
2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json 2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json 2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json 2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json 2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json 2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json 2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json 2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json 2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json 2024-12-15 01:44:32 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json
eval_operative_rules = []
eval_facts = []
eval_terms = []
eval_names = []
for manager, file_info in zip(managers, file_info_list):
# Process documents
eval_operative_rules.extend(manager.retrieve_document(
"validation_judge_Operative_Rules", "llm_validation"
).content)
eval_names.extend(manager.retrieve_document(
"validation_judge_Names", "llm_validation"
).content)
eval_terms.extend(manager.retrieve_document(
"validation_judge_Terms", "llm_validation"
).content)
eval_facts.extend(manager.retrieve_document(
"validation_judge_Fact_Types", "llm_validation"
).content)
logger.info(f"Operative Rules: {len(eval_operative_rules)}")
logger.info(f"Names: {len(eval_names)}")
logger.info(f"Terms: {len(eval_terms)}")
logger.info(f"Facts: {len(eval_facts)}")
2024-12-15 01:44:32 - INFO - Operative Rules: 60 2024-12-15 01:44:32 - INFO - Names: 50 2024-12-15 01:44:32 - INFO - Terms: 280 2024-12-15 01:44:32 - INFO - Facts: 160
elements_data = {
"Operative_Rules": eval_operative_rules,
"Names": eval_names,
"Terms": eval_terms,
"Fact_Types": eval_facts,
}
for key in elements_data.keys():
logger.info(f"{key}: {len(elements_data[key])}")
2024-12-15 01:44:32 - INFO - Operative_Rules: 60 2024-12-15 01:44:32 - INFO - Names: 50 2024-12-15 01:44:32 - INFO - Terms: 280 2024-12-15 01:44:32 - INFO - Fact_Types: 160
Checking missing data
for element_key in elements_data.keys():
element_df = pd.DataFrame(elements_data[element_key])
mi.matrix(element_df, figsize=(10, 5))
plt.title(f"Missing Values for {element_key}")
Metrics¶
combined_df = process_all_elements_updated(elements_data, config["DEFAULT_OUTPUT_DIR"])
/tmp/ipykernel_127502/1859200400.py:21: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. boxplot = plt.boxplot(
/tmp/ipykernel_127502/1859200400.py:21: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. boxplot = plt.boxplot(
/tmp/ipykernel_127502/1859200400.py:21: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. boxplot = plt.boxplot(
/tmp/ipykernel_127502/1859200400.py:21: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. boxplot = plt.boxplot(
/tmp/ipykernel_127502/1859200400.py:21: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. boxplot = plt.boxplot(
Describing the metrics semscore and similarity_score
combined_df.groupby("element_type")[["semscore", "similarity_score"]].describe()#.to_excel(config["DEFAULT_OUTPUT_DIR"] + "/sem_sim_descriptive_stats.xlsx")
| semscore | similarity_score | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| element_type | ||||||||||||||||
| Fact_Types | 160.0 | 0.875031 | 0.054416 | 0.714021 | 0.821412 | 0.883450 | 0.918956 | 0.972246 | 160.0 | 0.920000 | 0.056413 | 0.70 | 0.90 | 0.95 | 0.95 | 0.95 |
| Names | 50.0 | 0.885333 | 0.030958 | 0.810035 | 0.883378 | 0.902060 | 0.904557 | 0.909534 | 50.0 | 0.949000 | 0.007071 | 0.90 | 0.95 | 0.95 | 0.95 | 0.95 |
| Operative_Rules | 60.0 | 0.907990 | 0.019192 | 0.873461 | 0.889514 | 0.910879 | 0.922699 | 0.933172 | 60.0 | 0.903333 | 0.047716 | 0.75 | 0.90 | 0.90 | 0.95 | 0.95 |
| Terms | 280.0 | 0.848829 | 0.077379 | 0.504298 | 0.829878 | 0.851205 | 0.904896 | 0.960378 | 280.0 | 0.920536 | 0.051306 | 0.60 | 0.90 | 0.95 | 0.95 | 1.00 |
See correlation analysis below
Similarity_score and confidence
combined_df.groupby("element_type")[["similarity_score", "similarity_score_confidence"]].describe()
| similarity_score | similarity_score_confidence | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| element_type | ||||||||||||||||
| Fact_Types | 160.0 | 0.920000 | 0.056413 | 0.70 | 0.90 | 0.95 | 0.95 | 0.95 | 160.0 | 0.891563 | 0.026435 | 0.7 | 0.90 | 0.9 | 0.9 | 0.90 |
| Names | 50.0 | 0.949000 | 0.007071 | 0.90 | 0.95 | 0.95 | 0.95 | 0.95 | 50.0 | 0.901000 | 0.007071 | 0.9 | 0.90 | 0.9 | 0.9 | 0.95 |
| Operative_Rules | 60.0 | 0.903333 | 0.047716 | 0.75 | 0.90 | 0.90 | 0.95 | 0.95 | 60.0 | 0.879167 | 0.029533 | 0.8 | 0.85 | 0.9 | 0.9 | 0.90 |
| Terms | 280.0 | 0.920536 | 0.051306 | 0.60 | 0.90 | 0.95 | 0.95 | 1.00 | 280.0 | 0.889107 | 0.033024 | 0.8 | 0.85 | 0.9 | 0.9 | 1.00 |
# Calculate correlation by element_type
combined_df.groupby("element_type").apply(
lambda group: group["similarity_score"].corr(group["similarity_score_confidence"])
).reset_index(name="correlation")
/tmp/ipykernel_127502/3731941340.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
combined_df.groupby("element_type").apply(
| element_type | correlation | |
|---|---|---|
| 0 | Fact_Types | 0.598877 |
| 1 | Names | -1.000000 |
| 2 | Operative_Rules | 0.591359 |
| 3 | Terms | 0.555577 |
transformation_accuracy and grammar_syntax_accuracy
combined_df.groupby("element_type")[["transformation_accuracy", "grammar_syntax_accuracy"]].describe()
| transformation_accuracy | grammar_syntax_accuracy | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| element_type | ||||||||||||||||
| Fact_Types | 160.0 | 0.866875 | 0.089000 | 0.50 | 0.85 | 0.90 | 0.90 | 0.95 | 160.0 | 0.931250 | 0.088622 | 0.50 | 0.95 | 0.95 | 0.95 | 1.00 |
| Names | 50.0 | 0.919000 | 0.026515 | 0.85 | 0.90 | 0.90 | 0.95 | 0.95 | 50.0 | 0.970000 | 0.024744 | 0.95 | 0.95 | 0.95 | 1.00 | 1.00 |
| Operative_Rules | 60.0 | 0.840000 | 0.073531 | 0.70 | 0.80 | 0.85 | 0.90 | 0.90 | 60.0 | 0.927500 | 0.054792 | 0.70 | 0.95 | 0.95 | 0.95 | 0.95 |
| Terms | 280.0 | 0.879464 | 0.080086 | 0.50 | 0.80 | 0.90 | 0.95 | 1.00 | 280.0 | 0.954821 | 0.064090 | 0.40 | 0.95 | 0.95 | 1.00 | 1.00 |
# Calculate correlation by element_type
combined_df.groupby("element_type").apply(
lambda group: group["transformation_accuracy"].corr(group["grammar_syntax_accuracy"])
).reset_index(name="correlation")
/tmp/ipykernel_127502/1485779397.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
combined_df.groupby("element_type").apply(
| element_type | correlation | |
|---|---|---|
| 0 | Fact_Types | 0.680278 |
| 1 | Names | 0.808757 |
| 2 | Operative_Rules | 0.689936 |
| 3 | Terms | 0.736881 |
Correlation analysis similarity_score and semscore¶
Top 10 lowest semscore
# Make a copy of the DataFrame for further analysis
df_aval = combined_df.copy()
df_similarity = combined_df.copy()
df_agree = combined_df.copy()
Top 10 lowest similarity_score
df_smallest = df_aval.nsmallest(15, ["semscore"])
df_smallest
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 295 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.504298 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 211 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.534698 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 155 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588489 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence maintains the meanin... | Terms |
| 183 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588578 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 323 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588613 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 127 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588633 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 351 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588633 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 239 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588672 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 379 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588672 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 267 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.589421 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 329 | § 275.0-7 | Control | The power, directly or indirectly, to direct t... | [(b)(1), (a)(3)] | 0.685248 | 0.90 | 0.85 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Terms |
| 217 | § 275.0-7 | Control | The power, directly or indirectly, to direct t... | [(b)(1), (a)(3)] | 0.711167 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Terms |
| 245 | § 275.0-7 | Control | The power, directly or indirectly, to direct t... | [(b)(1), (a)(3)] | 0.712439 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Terms |
| 189 | § 275.0-7 | Control | The power, directly or indirectly, to direct t... | [(a)(3), (b)(1)] | 0.712491 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Terms |
| 385 | § 275.0-7 | Control | The power, directly or indirectly, to direct t... | [(b)(1), (a)(3)] | 0.713834 | 0.90 | 0.85 | 0.8 | 0.95 | [The transformed sentence maintains the core m... | Terms |
# Convert the 'sources' column to a string type to allow dropping duplicates
df_aval["sources"] = df_aval["sources"].apply(str)
# Filter the distinct records based on doc_id, statement_id, statement, and sources
df_aval.drop_duplicates(subset=["doc_id", "statement_id", "statement", "sources"]).nsmallest(15, ["semscore"])
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 127 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | ['(c)'] | 0.588633 | 0.90 | 0.85 | 0.90 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 115 | § 275.0-5 | Notice | A publication in the Federal Register indicati... | ['(a)'] | 0.764826 | 0.95 | 0.90 | 0.95 | 1.00 | [The transformed sentence accurately reflects ... | Terms |
| 130 | § 275.0-7 | Small business | An investment adviser with assets under manage... | ['(a)'] | 0.766613 | 0.85 | 0.90 | 0.80 | 0.95 | [The transformed sentence captures the main id... | Terms |
| 158 | \n§ 275.0-7 | Small business | An investment adviser with assets under manage... | ['(a)'] | 0.766658 | 0.85 | 0.90 | 0.80 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
| 114 | § 275.0-5 | Order disposing of the matter | An order issued after the period of time for s... | ['(b)', '(a)'] | 0.774899 | 0.90 | 0.85 | 0.80 | 0.95 | [The transformed sentence maintains the core m... | Terms |
| 142 | § 275.0-5 | Order disposing of the matter | An order issued after the period of time for s... | ['(a)', '(b)'] | 0.775006 | 0.90 | 0.85 | 0.80 | 0.95 | [The transformed sentence maintains the core m... | Terms |
| 414 | § 275.0-7 | 3 | An investment adviser did not have total asset... | ['(a)(2)'] | 0.792294 | 0.95 | 0.90 | 0.90 | 1.00 | [The transformed sentence accurately reflects ... | Fact_Types |
| 398 | \n§ 275.0-7 | 3 | An investment adviser did not have total asset... | ['(a)(2)'] | 0.792303 | 0.95 | 0.90 | 0.90 | 1.00 | [The transformed sentence accurately reflects ... | Fact_Types |
| 400 | § 275.0-7 | 5 | Control means the power, directly or indirectl... | ['(b)(1)'] | 0.795561 | 0.90 | 0.85 | 0.90 | 0.95 | [The transformed sentence maintains the core m... | Fact_Types |
| 131 | § 275.0-7 | Small organization | An investment adviser with assets under manage... | ['(a)'] | 0.810215 | 0.85 | 0.90 | 0.80 | 0.95 | [The transformed sentence captures the main id... | Terms |
| 187 | \n§ 275.0-7 | Small organization | An investment adviser with assets under manage... | ['(a)'] | 0.810718 | 0.85 | 0.90 | 0.80 | 0.95 | [The transformed sentence captures the main id... | Terms |
| 395 | § 275.0-5 | 5 | For purposes of this rule, an application mean... | ['(d)'] | 0.812092 | 0.95 | 0.90 | 0.90 | 0.95 | [The transformed sentence accurately reflects ... | Fact_Types |
| 128 | § 275.0-5 | Order of the Commission | An order issued by the Commission under the Act. | ['(d)'] | 0.814453 | 0.60 | 0.80 | 0.50 | 0.40 | [The transformed sentence does not accurately ... | Terms |
| 404 | § 275.0-7 | 9 | A person is presumed to control a trust if the... | ['(b)(1)(iv)'] | 0.820486 | 0.80 | 0.90 | 0.70 | 0.60 | [The transformed sentence maintains the core i... | Fact_Types |
| 123 | § 275.0-5 | Reasons | The justification provided by an interested pe... | ['(a)'] | 0.830164 | 0.95 | 0.90 | 0.90 | 0.95 | [The transformed sentence accurately reflects ... | Terms |
df_similarity['score_difference'] = df_similarity['similarity_score'] - df_similarity['semscore']
df_similarity
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | score_difference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | § 275.0-2 | 3 | The Secretary of the Commission (Secretary) wi... | [(a)(2)] | 0.907310 | 0.90 | 0.90 | 0.85 | 0.95 | [The transformed sentence maintains the core m... | Operative_Rules | -0.007310 |
| 1 | § 275.0-2 | 4 | If the Secretary certifies that the Commission... | [(a)(3)] | 0.922476 | 0.90 | 0.85 | 0.90 | 0.95 | [The transformed sentence maintains the origin... | Operative_Rules | -0.022476 |
| 2 | § 275.0-5 | 1 | Notice of the initiation of the proceeding wil... | [(a)] | 0.889516 | 0.95 | 0.90 | 0.90 | 0.95 | [The transformed sentence maintains the origin... | Operative_Rules | 0.060484 |
| 3 | § 275.0-5 | 2 | Any interested person may, within the period o... | [(a)] | 0.881079 | 0.85 | 0.90 | 0.80 | 0.95 | [The transformed sentence captures the essence... | Operative_Rules | -0.031079 |
| 4 | § 275.0-5 | 3 | An order disposing of the matter will be issue... | [(b)] | 0.933155 | 0.90 | 0.85 | 0.70 | 0.80 | [The transformed sentence uses 'may be issued'... | Operative_Rules | -0.033155 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 545 | § 275.0-7 | 6 | A person is presumed to control a corporation ... | [(b)(1)(i)(A)] | 0.732264 | 0.85 | 0.90 | 0.80 | 0.95 | [The transformed sentence maintains the core m... | Fact_Types | 0.117736 |
| 546 | § 275.0-7 | 7 | A person is presumed to control a partnership ... | [(b)(1)(ii)] | 0.889638 | 0.95 | 0.90 | 0.90 | 0.95 | [The transformed sentence maintains the origin... | Fact_Types | 0.060362 |
| 547 | § 275.0-7 | 8 | A person is presumed to control a limited liab... | [(b)(1)(iii)] | 0.940801 | 0.95 | 0.90 | 0.90 | 0.95 | [The transformed sentence maintains the origin... | Fact_Types | 0.009199 |
| 548 | § 275.0-7 | 9 | A person is presumed to control a trust if the... | [(b)(1)(iv)] | 0.821413 | 0.80 | 0.90 | 0.70 | 0.60 | [The transformed sentence introduces 'by defin... | Fact_Types | -0.021413 |
| 549 | § 275.0-7 | 10 | Total assets means the total assets as shown o... | [(b)(2)] | 0.920697 | 0.95 | 0.90 | 0.90 | 0.95 | [The transformed sentence closely follows the ... | Fact_Types | 0.029303 |
550 rows Ć 12 columns
# Plot the semscore, similarity score, and score difference on the same graph
plt.figure(figsize=(12, 6))
plt.plot(df_similarity.index, df_similarity['semscore'], color='#D55E00', marker='x', linestyle='--', label='Semscore')
plt.plot(df_similarity.index, df_similarity['similarity_score'], color='#0072B2', marker='o', linestyle='-', label='Similarity Score')
plt.title('Semscore, and Similarity Score Across Records')
plt.xlabel('Record Index')
plt.ylabel('Scores')
plt.grid(True)
plt.legend()
plt.show()
# Plot the score difference as a line chart
plt.figure(figsize=(10, 6))
plt.plot(df_similarity.index, df_similarity['score_difference'], marker='o', linestyle='-', label='Score Difference')
plt.title('Score Difference Across Records')
plt.xlabel('Record Index')
plt.ylabel('Score Difference')
plt.grid(True)
plt.legend()
plt.show()
# Create an interactive scatter plot
fig = go.Figure()
marker_map = {
'Operative_Rules': 'circle',
'Names': 'x',
'Terms': 'triangle-up',
'Fact_Types': 'diamond'
}
# Add a trace for each element_type
unique_types = df_similarity['element_type'].unique()
for etype in unique_types:
filtered_data = df_similarity[df_similarity['element_type'] == etype]
fig.add_trace(go.Scatter(
x=filtered_data.index,
y=filtered_data['score_difference'],
mode='lines+markers',
marker=dict(symbol=marker_map[etype]), # Wrap the symbol in a dictionary
name=etype,
visible=True # Ensure all traces are visible initially
))
# Add dropdown to filter by element_type
dropdown_buttons = [
dict(label="All",
method="update",
args=[{"visible": [True] * len(unique_types)}, # Show all traces
{"title": "Score Difference - All Element Types"}]),
]
for i, etype in enumerate(unique_types):
dropdown_buttons.append(
dict(label=etype,
method="update",
args=[{"visible": [j == i for j in range(len(unique_types))]}, # Show only the selected trace
{"title": f"Score Difference - {etype}"}])
)
fig.update_layout(
updatemenus=[
dict(
buttons=dropdown_buttons,
direction="down",
showactive=True,
x=0.1,
y=1.15
)
],
title="Score Difference Across Element Types",
xaxis_title="Record Index",
yaxis_title="Score Difference",
showlegend=True
)
fig.show()
df_agree['score_difference'] = df_agree['similarity_score'] - df_agree['semscore']
# Calculate the required values
agree = ((df_agree['score_difference'] >= -0.01) & (df_agree['score_difference'] <= 0.01)).sum()
similarity = (1 - df_agree.loc[df_agree['score_difference'] > 0.01, 'score_difference']).sum()
semscore = (1 - df_agree.loc[df_agree['score_difference'] < 0.01, 'score_difference']).sum()
# Create a new DataFrame with the calculated values
summary_df = pd.DataFrame({
'Metric': ['Agree', 'Similarity', 'Semscore'],
'Value': [agree, similarity, semscore]
})
# Plot the histogram
plt.figure(figsize=(8, 6))
plt.bar(summary_df['Metric'], summary_df['Value'], color=['blue', 'green', 'red'])
plt.title('Histogram of Metrics')
plt.xlabel('Metrics')
plt.ylabel('Values')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Display the calculated values
summary_df
| Metric | Value | |
|---|---|---|
| 0 | Agree | 34.000000 |
| 1 | Similarity | 412.242183 |
| 2 | Semscore | 107.580991 |
# Count the occurrences for each metric
agree_count = ((df_agree['score_difference'] >= -0.01) & (df_agree['score_difference'] <= 0.01)).sum()
similarity_count = (df_agree['score_difference'] > 0.01).sum()
semscore_count = (df_agree['score_difference'] < 0.01).sum()
# Create a new DataFrame with the counts
count_summary_df = pd.DataFrame({
'Metric': ['Agree', 'Similarity', 'Semscore'],
'Count': [agree_count, similarity_count, semscore_count]
})
# Plot the histogram for counts
plt.figure(figsize=(8, 6))
plt.bar(count_summary_df['Metric'], count_summary_df['Count'], color=['blue', 'green', 'red'])
plt.title('Histogram of Metric Counts')
plt.xlabel('Metrics')
plt.ylabel('Counts')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Display the calculated counts
count_summary_df
| Metric | Count | |
|---|---|---|
| 0 | Agree | 34 |
| 1 | Similarity | 447 |
| 2 | Semscore | 103 |
# Define the 10% margin
margin = 0.1
# Compute agreement within the +/-10% margin
agreement_margin = ((df_agree['similarity_score'] >= (df_agree['semscore'] - margin)) &
(df_agree['similarity_score'] <= (df_agree['semscore'] + margin))).sum()
# Compute disagreement outside the +/-10% margin
disagreement_margin = len(df_agree) - agreement_margin
# Display the results
agreement_disagreement_summary = pd.DataFrame({
'Metric': ['Agreement', 'Disagreement'],
'Count': [agreement_margin, disagreement_margin]
})
agreement_disagreement_summary
| Metric | Count | |
|---|---|---|
| 0 | Agreement | 412 |
| 1 | Disagreement | 138 |
# Compute proportional agreement within the ±10% margin
df_agree['agreement_proportion'] = 1 - (df_agree['similarity_score'] - df_agree['semscore']).abs() / margin
#df_agree['agreement_proportion'] = df_agree['agreement_proportion'].clip(lower=0) # Clip negative values to 0
# Plot the proportional agreement series
plt.figure(figsize=(12, 6))
plt.plot(df_agree.index, df_agree['agreement_proportion'], marker='o', linestyle='-', label='Proportional Agreement')
plt.title('Proportional Agreement Series')
plt.xlabel('Record Index')
plt.ylabel('Agreement Proportion')
plt.grid(True)
plt.legend()
plt.show()
Agree that something is bad
# Define the threshold for "low" scores
low_threshold = 0.68
# Identify rows where both metrics are below the threshold
low_agreement_df = df_agree[
#(df_agree['similarity_score'] < low_threshold)
(df_agree['semscore'] < low_threshold)
]
# Display the filtered dataframe
low_agreement_df
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | score_difference | agreement_proportion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 127 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588633 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.311367 | -2.113668 |
| 155 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588489 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence maintains the meanin... | Terms | 0.311511 | -2.115114 |
| 183 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588578 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.311422 | -2.114224 |
| 211 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.534698 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.315302 | -2.153024 |
| 239 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588672 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.311328 | -2.113276 |
| 267 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.589421 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.310579 | -2.105792 |
| 295 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.504298 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.345702 | -2.457016 |
| 323 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588613 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.311387 | -2.113867 |
| 351 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588633 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.311367 | -2.113668 |
| 379 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588672 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.311328 | -2.113276 |
# Add a temporary column for the product
df_agree['product'] = df_agree['similarity_score'] * df_agree['semscore']
# Sort the dataframe by the product column
sorted_df = df_agree.sort_values(by='product', ascending=True)
# Drop the temporary column if it's no longer needed
df_agree = df_agree.drop(columns=['product'])
# Display the sorted dataframe
sorted_df
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | score_difference | agreement_proportion | product | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 295 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.504298 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.345702 | -2.457016 | 0.428654 |
| 211 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.534698 | 0.85 | 0.90 | 0.8 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.315302 | -2.153024 | 0.454493 |
| 296 | § 275.0-5 | Order of the Commission | An order issued by the Commission under the Act. | [(d)] | 0.814386 | 0.60 | 0.80 | 0.5 | 0.40 | [The transformed sentence does not accurately ... | Terms | -0.214386 | -1.143860 | 0.488632 |
| 128 | § 275.0-5 | Order of the Commission | An order issued by the Commission under the Act. | [(d)] | 0.814453 | 0.60 | 0.80 | 0.5 | 0.40 | [The transformed sentence does not accurately ... | Terms | -0.214453 | -1.144528 | 0.488672 |
| 155 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588489 | 0.90 | 0.85 | 0.9 | 0.95 | [The transformed sentence maintains the meanin... | Terms | 0.311511 | -2.115114 | 0.529640 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 197 | § 275.0-2 | Non-resident | An individual, corporation, partnership, or ot... | [(b)(2)] | 0.960137 | 1.00 | 1.00 | 1.0 | 1.00 | [The transformed sentence accurately reflects ... | Terms | 0.039863 | 0.601373 | 0.960137 |
| 169 | § 275.0-2 | Non-resident | An individual, corporation, partnership, or ot... | [(b)(2)] | 0.960137 | 1.00 | 1.00 | 1.0 | 1.00 | [The transformed sentence accurately reflects ... | Terms | 0.039863 | 0.601373 | 0.960137 |
| 337 | § 275.0-2 | Non-resident | An individual, corporation, partnership, or ot... | [(b)(2)] | 0.960137 | 1.00 | 1.00 | 1.0 | 1.00 | [The transformed sentence accurately reflects ... | Terms | 0.039863 | 0.601373 | 0.960137 |
| 113 | § 275.0-2 | Non-resident | An individual, corporation, partnership, or ot... | [(b)(2)] | 0.960142 | 1.00 | 1.00 | 1.0 | 1.00 | [The transformed sentence accurately reflects ... | Terms | 0.039858 | 0.601421 | 0.960142 |
| 225 | § 275.0-2 | Non-resident | An individual, corporation, partnership, or ot... | [(b)(2)] | 0.960378 | 1.00 | 1.00 | 1.0 | 1.00 | [The transformed sentence accurately reflects ... | Terms | 0.039622 | 0.603782 | 0.960378 |
550 rows Ć 14 columns
# Display the dataframe with the proportional agreement column
df_agree.sort_values('agreement_proportion', ascending=True)
| doc_id | statement_id | statement | sources | semscore | similarity_score | similarity_score_confidence | transformation_accuracy | grammar_syntax_accuracy | findings | element_type | score_difference | agreement_proportion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 295 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.504298 | 0.85 | 0.90 | 0.80 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.345702 | -2.457016 |
| 211 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.534698 | 0.85 | 0.90 | 0.80 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.315302 | -2.153024 |
| 155 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588489 | 0.90 | 0.85 | 0.90 | 0.95 | [The transformed sentence maintains the meanin... | Terms | 0.311511 | -2.115114 |
| 183 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588578 | 0.90 | 0.85 | 0.90 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.311422 | -2.114224 |
| 323 | § 275.0-5 | Protection of investors | A consideration for ordering a hearing if it a... | [(c)] | 0.588613 | 0.90 | 0.85 | 0.90 | 0.95 | [The transformed sentence accurately reflects ... | Terms | 0.311387 | -2.113867 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 0 | § 275.0-2 | 3 | The Secretary of the Commission (Secretary) wi... | [(a)(2)] | 0.907310 | 0.90 | 0.90 | 0.85 | 0.95 | [The transformed sentence maintains the core m... | Operative_Rules | -0.007310 | 0.926898 |
| 344 | § 275.0-5 | Period of time | The timeframe specified in the notice during w... | [(a), (b)] | 0.905456 | 0.90 | 0.95 | 0.85 | 0.95 | [The transformed sentence maintains the core m... | Terms | -0.005456 | 0.945445 |
| 116 | § 275.0-5 | Initiation of the proceeding | The process that begins when a notice is publi... | [(a)] | 0.944867 | 0.95 | 0.90 | 0.95 | 1.00 | [The transformed sentence accurately reflects ... | Terms | 0.005133 | 0.948668 |
| 97 | § 275.0-5 | Federal Register | The official journal where the notice of initi... | [(a)] | 0.904537 | 0.90 | 0.95 | 0.85 | 0.95 | [The transformed sentence maintains the core m... | Names | -0.004537 | 0.954633 |
| 48 | § 275.0-2 | 3 | The Secretary of the Commission (Secretary) wi... | [(a)(2)] | 0.897325 | 0.90 | 0.85 | 0.85 | 0.95 | [The transformed sentence maintains the core m... | Operative_Rules | 0.002675 | 0.973246 |
550 rows Ć 13 columns
Correlation analysis using Spearman, Kendall, and Pearson
Kendall
# Compute Kendall's Tau correlation to assess monotonicity
kendall_correlation, p_value_kendall = kendalltau(df_agree['similarity_score'], df_agree['semscore'])
kendall_correlation, p_value_kendall
(0.2616709607635915, 2.5339459779402854e-15)
Spearman
# Check if the relationship between similarity_score and semscore is monotonic
# Compute Spearman's rank correlation to assess monotonicity
spearman_correlation, p_value = spearmanr(df_agree['similarity_score'], df_agree['semscore'])
spearman_correlation, p_value
(0.32938238960280797, 2.200917948508083e-15)
Pearson
# Check if the relationship between similarity_score and semscore is monotonic
# Compute Spearman's rank correlation to assess monotonicity
pearsonr_correlation, p_value = pearsonr(df_agree['similarity_score'], df_agree['semscore'])
pearsonr_correlation, p_value
(0.2573123366775016, 9.114258452255619e-10)
# Calculate the correlation between similarity_score and semscore
correlation = df_agree['similarity_score'].corr(df_agree['semscore'])
correlation
0.2573123366775016
# Dados
x = df_agree['similarity_score']
y = df_agree['semscore']
# Criar o grÔfico de dispersão
plt.figure(figsize=(8, 6)) # Tamanho do grƔfico
plt.scatter(x, y, alpha=0.5, color="blue", edgecolor="k", label="Data points")
# Adicionar linha de tendĆŖncia linear
slope, intercept, _, _, _ = linregress(x, y)
x_line = np.linspace(min(x), max(x), 100)
y_line = slope * x_line + intercept
plt.plot(x_line, y_line, color="red", linewidth=2, label="Linear Trend Line")
# Melhorar a grid com intervalos mais detalhados
plt.grid(visible=True, which='both', linestyle='--', linewidth=0.5)
# Adicionar tĆtulo e rótulos
plt.title('Scatterplot: Similarity Score vs SemScore with Trendline')
plt.xlabel('Similarity Score')
plt.ylabel('SemScore')
plt.legend() # Mostrar legenda
# Mostrar o grƔfico
plt.tight_layout()
plt.show()
A correlation of -0.107 indicates a weak negative linear relationship between the variables, suggesting that as one variable slightly increases, the other tends to decrease marginally. However, the relationship is negligible, indicating little to no linear association. This weak correlation implies that changes in one variable do not reliably predict changes in the other. Furthermore, the low magnitude does not preclude the possibility of a non-linear relationship, which would require alternative methods of analysis for detection.
Prompt analysis¶
Analyze number of tokens from prompts and documents from last checkpoint using gpt-4o as a reference model.
According to OpenAI | models, the maximum number of tokens (context length) for gpt-4o is 128k.
The cost to use gpt-4o is 2.50 USD per 1m tokens in 2024-10-31. Source: OpenAI | pricing.
Extract elapse times and completions from all sessions.
managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])
2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-1.json 2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json 2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-2.json 2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-3.json 2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-4.json 2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-5.json 2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-6.json 2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-7.json 2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-8.json 2024-12-15 01:44:39 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-9.json
tokens_eval = {"doc_type": [], "elapsed_times": [], "completions": [], "file_infos": []}
for manager, file_info in zip(managers, file_info_list):
# Process documents
for key in manager.model_dump()["documents"].keys():
if key[1].startswith("llm_"):
doc = manager.retrieve_document(key[0], key[1])
logger.info(f"Processing: {key[0]}, {key[1]}")
elapsed_times = doc.elapsed_times
logger.debug(f"Elapsed time: {elapsed_times}")
completions = doc.completions
logger.debug(f"Completions: {completions}")
tokens_eval["doc_type"].append(key[1])
tokens_eval["elapsed_times"].append(elapsed_times)
tokens_eval["completions"].append(completions)
tokens_eval["file_infos"].append(file_info)
logger.info(f"Executions for evaluation: {len(tokens_eval['doc_type'])}")
2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-15 01:44:39 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-15 01:44:39 - INFO - Processing: classify_P1, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-15 01:44:39 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: transform_Names, llm_response_transform 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-15 01:44:39 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: classify_P1, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Names, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: classify_P1, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Names, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: classify_P1, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Names, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-2_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-5_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P1, llm_response 2024-12-15 01:44:40 - INFO - Processing: § 275.0-7_P2, llm_response 2024-12-15 01:44:40 - INFO - Processing: classify_P1, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Operative_rules, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_terms, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_names, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: classify_P2_Definitional_facts, llm_response_classification 2024-12-15 01:44:40 - INFO - Processing: transform_Operative_Rules, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Fact_Types, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Terms, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: transform_Names, llm_response_transform 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Operative_Rules, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Fact_Types, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Terms, llm_validation 2024-12-15 01:44:40 - INFO - Processing: validation_judge_Names, llm_validation 2024-12-15 01:44:40 - INFO - Executions for evaluation: 190
Evaluate
# Constants
reference_models = config["REFERENCE_MODELS"]["MAX_CONTEXT_LENGTH"]
price_per_million_tokens = config["REFERENCE_MODELS"]["PRICE_PER_MILLION_TOKENS"]
# Initialize an empty list to store the raw data
raw_data = []
# Assuming tokens_eval is already defined and contains the necessary data
for doc_type, elapsed_times, completions, file_info in zip(
tokens_eval["doc_type"],
tokens_eval["elapsed_times"],
tokens_eval["completions"],
tokens_eval["file_infos"],
):
for elapsed_time, completion in zip(elapsed_times, completions):
raw_data.append(
(
file_info["filename"],
doc_type,
elapsed_time,
completion["usage"],
completion["created"],
completion["model"],
)
)
prompt_data_df = prompt_analysis(raw_data, config["DEFAULT_OUTPUT_DIR"])
Overall Statistics:
Total Tokens Number of Samples Average Elapsed Time (s) Estimated Cost (USD) Average Percentage of Context Length (%) Min Created Timestamp Max Created Timestamp origin run_at
5472633 1210 4.055949 13.681582 3.533467 2024-11-30 00:08:20 2024-12-15 03:44:41 documents-2024-12-08-9.json 2024-12-15 01:44:40
Statistics by Sample Type (doc_type):
doc_type total_tokens num_samples average_elapsed_time average_tokens estimated_cost average_percentage_context_length filename run_at
llm_response 272850 60 28.951583 4547.500000 0.682125 3.552734 documents-2024-12-08-9.json 2024-12-15 01:44:40
llm_response_classification 370390 50 8.280727 7407.800000 0.925975 5.787344 documents-2024-12-08-9.json 2024-12-15 01:44:40
llm_response_transform 2480340 550 2.239824 4509.709091 6.200850 3.523210 documents-2024-12-08-9.json 2024-12-15 01:44:40
llm_validation 2349053 550 2.772115 4271.005455 5.872632 3.336723 documents-2024-12-08-9.json 2024-12-15 01:44:40
Statistics by Model:
model total_tokens num_samples average_elapsed_time average_tokens average_percentage_context_length filename run_at estimated_cost cost
gpt-4o-2024-08-06 5472633 1210 4.055949 4522.83719 3.533467 documents-2024-12-08-9.json 2024-12-15 01:44:40 13.681583 13.681583
Additional Statistics:
Average Completion Tokens Average Prompt Tokens Average Total Tokens per Sample Total Elapsed Time (s) Average Tokens per Second origin run_at
314.194215 4208.642975 4522.83719 4907.698278 1802.337863 documents-2024-12-08-9.json 2024-12-15 01:44:40
prompt_data_df.describe()
| elapsed_time | created | completion_tokens | prompt_tokens | total_tokens | reference_context_length | price_per_million_tokens | tokens_per_second | |
|---|---|---|---|---|---|---|---|---|
| count | 1210.000000 | 1210 | 1210.000000 | 1210.000000 | 1210.00000 | 1210.0 | 1210.0 | 1210.000000 |
| mean | 4.055949 | 2024-12-11 08:39:27.552892416 | 314.194215 | 4208.642975 | 4522.83719 | 128000.0 | 2.5 | 1802.337863 |
| min | 1.502028 | 2024-11-30 00:08:20 | 131.000000 | 1357.000000 | 1501.00000 | 128000.0 | 2.5 | 127.116341 |
| 25% | 2.005183 | 2024-12-09 01:54:09 | 156.000000 | 2337.000000 | 4597.00000 | 128000.0 | 2.5 | 1056.751444 |
| 50% | 2.466322 | 2024-12-09 01:55:39 | 173.000000 | 4722.000000 | 5127.00000 | 128000.0 | 2.5 | 1891.422901 |
| 75% | 2.913875 | 2024-12-15 03:16:51.750000128 | 200.000000 | 5074.000000 | 5257.00000 | 128000.0 | 2.5 | 2480.162415 |
| max | 43.604876 | 2024-12-15 03:44:41 | 4517.000000 | 8120.000000 | 8590.00000 | 128000.0 | 2.5 | 3466.646635 |
| std | 6.358610 | NaN | 588.970664 | 1557.668782 | 1508.34103 | 0.0 | 0.0 | 857.387664 |
# Running the analysis
stats = summary_statistics(prompt_data_df)
token_usage_analysis(prompt_data_df)
time_efficiency_analysis(prompt_data_df)
cost_analysis(prompt_data_df)
temporal_analysis(prompt_data_df)
group_performance_analysis(prompt_data_df)
2024-12-15 01:44:41 - INFO - Total cost: $13.68
stats
| elapsed_time | created | completion_tokens | prompt_tokens | total_tokens | reference_context_length | price_per_million_tokens | tokens_per_second | |
|---|---|---|---|---|---|---|---|---|
| count | 1210.000000 | 1210 | 1210.000000 | 1210.000000 | 1210.00000 | 1210.0 | 1210.0 | 1210.000000 |
| mean | 4.055949 | 2024-12-11 08:39:27.552892416 | 314.194215 | 4208.642975 | 4522.83719 | 128000.0 | 2.5 | 1802.337863 |
| min | 1.502028 | 2024-11-30 00:08:20 | 131.000000 | 1357.000000 | 1501.00000 | 128000.0 | 2.5 | 127.116341 |
| 25% | 2.005183 | 2024-12-09 01:54:09 | 156.000000 | 2337.000000 | 4597.00000 | 128000.0 | 2.5 | 1056.751444 |
| 50% | 2.466322 | 2024-12-09 01:55:39 | 173.000000 | 4722.000000 | 5127.00000 | 128000.0 | 2.5 | 1891.422901 |
| 75% | 2.913875 | 2024-12-15 03:16:51.750000128 | 200.000000 | 5074.000000 | 5257.00000 | 128000.0 | 2.5 | 2480.162415 |
| max | 43.604876 | 2024-12-15 03:44:41 | 4517.000000 | 8120.000000 | 8590.00000 | 128000.0 | 2.5 | 3466.646635 |
| std | 6.358610 | NaN | 588.970664 | 1557.668782 | 1508.34103 | 0.0 | 0.0 | 857.387664 |